read_delim("../data/Files/ASV_table") -> bacteria
## New names:
## Rows: 659 Columns: 33173
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): ...1 dbl (33172): ASV0001, ASV0002, ASV0003, ASV0004, ASV0005, ASV0006,
## ASV0007, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
dim(bacteria) # 659 by 33173
## [1] 659 33173
head(bacteria)
## # A tibble: 6 × 33,173
## ...1 ASV0001 ASV0002 ASV0003 ASV0004 ASV0005 ASV0006 ASV0007 ASV0008 ASV0009
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SMAQ-… 18882 0 0 320 8742 6211 5201 0 4396
## 2 SMAQ-… 5569 0 0 2200 20914 14087 1817 0 1671
## 3 SMAQ-… 2910 0 126 881 24521 15744 1010 0 865
## 4 SMAQ-… 21262 0 0 486 12111 7410 6763 0 5930
## 5 SMAQ-… 14855 0 0 1842 28347 13910 4342 0 3876
## 6 SMAQ-… 44453 0 0 734 3263 1787 12608 0 10949
## # ℹ 33,163 more variables: ASV0010 <dbl>, ASV0011 <dbl>, ASV0012 <dbl>,
## # ASV0013 <dbl>, ASV0014 <dbl>, ASV0015 <dbl>, ASV0016 <dbl>, ASV0017 <dbl>,
## # ASV0018 <dbl>, ASV0019 <dbl>, ASV0020 <dbl>, ASV0022 <dbl>, ASV0023 <dbl>,
## # ASV0024 <dbl>, ASV0025 <dbl>, ASV0027 <dbl>, ASV0030 <dbl>, ASV0031 <dbl>,
## # ASV0032 <dbl>, ASV0033 <dbl>, ASV0034 <dbl>, ASV0035 <dbl>, ASV0036 <dbl>,
## # ASV0037 <dbl>, ASV0039 <dbl>, ASV0040 <dbl>, ASV0041 <dbl>, ASV0042 <dbl>,
## # ASV0043 <dbl>, ASV0044 <dbl>, ASV0046 <dbl>, ASV0047 <dbl>, …
tail(bacteria)
## # A tibble: 6 × 33,173
## ...1 ASV0001 ASV0002 ASV0003 ASV0004 ASV0005 ASV0006 ASV0007 ASV0008 ASV0009
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 PFAR-… 0 75 54828 164 0 0 0 0 0
## 2 PFAR-… 0 3231 38694 127 0 0 0 981 0
## 3 PFAR-… 0 2128 40708 119 0 0 0 653 0
## 4 PFAR-… 0 16 6886 35 0 0 0 5 0
## 5 PFAR-… 0 113 18970 0 0 0 0 0 0
## 6 PFAR-… 0 3512 64518 196 0 0 0 1184 0
## # ℹ 33,163 more variables: ASV0010 <dbl>, ASV0011 <dbl>, ASV0012 <dbl>,
## # ASV0013 <dbl>, ASV0014 <dbl>, ASV0015 <dbl>, ASV0016 <dbl>, ASV0017 <dbl>,
## # ASV0018 <dbl>, ASV0019 <dbl>, ASV0020 <dbl>, ASV0022 <dbl>, ASV0023 <dbl>,
## # ASV0024 <dbl>, ASV0025 <dbl>, ASV0027 <dbl>, ASV0030 <dbl>, ASV0031 <dbl>,
## # ASV0032 <dbl>, ASV0033 <dbl>, ASV0034 <dbl>, ASV0035 <dbl>, ASV0036 <dbl>,
## # ASV0037 <dbl>, ASV0039 <dbl>, ASV0040 <dbl>, ASV0041 <dbl>, ASV0042 <dbl>,
## # ASV0043 <dbl>, ASV0044 <dbl>, ASV0046 <dbl>, ASV0047 <dbl>, …
read_delim("../data/Files/16S_tax.txt") -> bacteria_tax
## Rows: 33172 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): ASV, Kingdom, Phylum, Class, Order, Family, Genus, Species
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(bacteria_tax)
## [1] 33172 8
names(bacteria_tax)
## [1] "ASV" "Kingdom" "Phylum" "Class" "Order" "Family" "Genus"
## [8] "Species"
bacteria_tax %>%
group_by(Species) %>%
summarise(n = n()) %>%
arrange(-n)
## # A tibble: 226 × 2
## Species n
## <chr> <int>
## 1 <NA> 32904
## 2 normanense 4
## 3 aquimaris 3
## 4 brevis 3
## 5 denitrificans 3
## 6 fermentans 3
## 7 flava 3
## 8 spongiae 3
## 9 accolens 2
## 10 aerolata 2
## # ℹ 216 more rows
33173 - 32904
## [1] 269
bacteria_tax %>%
group_by(Genus) %>%
summarise(n = n()) %>%
arrange(-n)
## # A tibble: 1,390 × 2
## Genus n
## <chr> <int>
## 1 <NA> 16482
## 2 Endozoicomonas 597
## 3 Kistimonas 244
## 4 OM27_clade 243
## 5 Flavobacterium 208
## 6 Bdellovibrio 200
## 7 Lewinella 199
## 8 Coxiella 194
## 9 MD3_55 185
## 10 Tenacibaculum 175
## # ℹ 1,380 more rows
bacteria_tax %>%
group_by(Family) %>%
summarise(n = n()) %>%
arrange(-n)
## # A tibble: 482 × 2
## Family n
## <chr> <int>
## 1 <NA> 5115
## 2 Flavobacteriaceae 2261
## 3 Rhodobacteraceae 1708
## 4 Saprospiraceae 1480
## 5 Cyclobacteriaceae 915
## 6 Endozoicomonadaceae 878
## 7 Mitochondria 683
## 8 Alteromonadaceae 565
## 9 Vibrionaceae 465
## 10 Bdellovibrionaceae 446
## # ℹ 472 more rows
bacteria_tax %>%
group_by(Order) %>%
summarise(n = n()) %>%
arrange(-n)
## # A tibble: 308 × 2
## Order n
## <chr> <int>
## 1 Flavobacteriales 3628
## 2 Chitinophagales 2068
## 3 Cytophagales 1987
## 4 Rickettsiales 1743
## 5 Rhodobacterales 1708
## 6 Oceanospirillales 1498
## 7 Burkholderiales 1209
## 8 Alteromonadales 1170
## 9 Bacteroidales 1107
## 10 Rhizobiales 798
## # ℹ 298 more rows
bacteria_tax %>%
filter(Order=="Flavobacteriales")
## # A tibble: 3,628 × 8
## ASV Kingdom Phylum Class Order Family Genus Species
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ASV0087 Bacteria Bacteroidota Bacteroidia Flavobacteria… Weeks… Chry… massil…
## 2 ASV0112 Bacteria Bacteroidota Bacteroidia Flavobacteria… Weeks… Cloa… norman…
## 3 ASV0120 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… Mari… <NA>
## 4 ASV0134 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… Nonl… <NA>
## 5 ASV0152 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… Kord… <NA>
## 6 ASV0154 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… <NA> <NA>
## 7 ASV0168 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… NS5_… <NA>
## 8 ASV0172 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… NS2b… <NA>
## 9 ASV0185 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… NS4_… <NA>
## 10 ASV0228 Bacteria Bacteroidota Bacteroidia Flavobacteria… Flavo… <NA> <NA>
## # ℹ 3,618 more rows
bacteria %>%
rename(sample_id = `...1`) -> bacteria
# take one ASV to be explored
#bacteria$ASV0001
# overview stats of ASV0001 counts (abundance)
summary(bacteria$ASV0001)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 949 8892 7632 136200
# how many rows are 0?
sum(bacteria$ASV0001==0) /nrow(bacteria) # 199 out of 659 samples = 30.2 % are 0 values
## [1] 0.3019727
plot(bacteria$ASV0001)
Citation:
DOI: 10.1111/mec.16871
# obtain top 20 names per the March publication
top_20_asv <- names(bacteria[2:21]) %>% as_vector()
bacteria %>%
# only need Top 20 to see quick distribution of each
select(top_20_asv) %>%
map(., ~plot(.))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(top_20_asv)
##
## # Now:
## data %>% select(all_of(top_20_asv))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## $ASV0001
## NULL
##
## $ASV0002
## NULL
##
## $ASV0003
## NULL
##
## $ASV0004
## NULL
##
## $ASV0005
## NULL
##
## $ASV0006
## NULL
##
## $ASV0007
## NULL
##
## $ASV0008
## NULL
##
## $ASV0009
## NULL
##
## $ASV0010
## NULL
##
## $ASV0011
## NULL
##
## $ASV0012
## NULL
##
## $ASV0013
## NULL
##
## $ASV0014
## NULL
##
## $ASV0015
## NULL
##
## $ASV0016
## NULL
##
## $ASV0017
## NULL
##
## $ASV0018
## NULL
##
## $ASV0019
## NULL
##
## $ASV0020
## NULL
bacteria %>%
# only need Top 20 to see quick distribution of each
select(top_20_asv[1:10]) %>%
GGally::ggpairs() -> ggpairs_top_10_asv
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
bacteria %>%
# only need Top 20 to see quick distribution of each
select(top_20_asv[11:20]) %>%
GGally::ggpairs() -> ggpairs_top_11_20_asv
ggsave("ggpairs_top_10_asv.png", ggpairs_top_10_asv, device = "png", path = "../plots")
## Saving 7 x 5 in image
ggsave("ggpairs_top_11_20_asv.png", ggpairs_top_11_20_asv, device = "png", path = "../plots")
## Saving 7 x 5 in image
# for one random sample id:
bacteria %>%
filter(sample_id=="PDOG-R1-1") %>%
pivot_longer(cols = starts_with("ASV"),
names_to = "ASV",
values_to = "count_ASV") %>%
# how many unique ASVs are there for this sample?
# need a 0 vs non-0 binary count column
mutate(zero = ifelse(count_ASV==0, "T", "F")) %>%
# group by the new binary column to see how many ASVs this sample has that are non-0
group_by(zero) %>%
summarise(n = n())
## # A tibble: 2 × 2
## zero n
## <chr> <int>
## 1 F 203
## 2 T 32969
32969 + 203
## [1] 33172
bacteria %>%
# begin pivot for all samples to switch ASV columns into one long column with its ASV name and its corresponding abundance counts all in one other column
# this will create a very long data frame but is needed to see the zero vs non-zero counts
pivot_longer(cols = starts_with("ASV"),
names_to = "ASV",
values_to = "count_ASV") %>%
# creating new column for simple grouping
mutate(zero = ifelse(count_ASV==0, "T", "F")) %>% # 21,860,348 rows
# before had only pivoted for one sample but below will group by sample and zero or not
group_by(sample_id, zero) %>%
summarise(n = n()) %>% # yes, 1,318 groups is 659 x 2 for T/F in each
# order by n to see which samples have the most T=zero counts
arrange(-n) -> unique_bacteria_counts_per_sample
## `summarise()` has grouped output by 'sample_id'. You can override using the
## `.groups` argument.
unique_bacteria_counts_per_sample %>%
arrange(sample_id)
## # A tibble: 1,318 × 3
## # Groups: sample_id [659]
## sample_id zero n
## <chr> <chr> <int>
## 1 PDOG-R1-1 T 32969
## 2 PDOG-R1-1 F 203
## 3 PDOG-R1-12 T 32959
## 4 PDOG-R1-12 F 213
## 5 PDOG-R1-13 T 32901
## 6 PDOG-R1-13 F 271
## 7 PDOG-R1-14 T 32933
## 8 PDOG-R1-14 F 239
## 9 PDOG-R1-16 T 32910
## 10 PDOG-R1-16 F 262
## # ℹ 1,308 more rows
# simple base R plot
plot(unique_bacteria_counts_per_sample$n)
theme_set(theme_bw())
unique_bacteria_counts_per_sample %>%
ggplot(aes(x = n, fill = sample_id)) +
geom_histogram(show.legend = F)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
unique_bacteria_counts_per_sample %>%
filter(zero=="F") %>% # F group means ASV count was greater than 0
ggplot(aes(x = n)) +
geom_histogram(show.legend = F) +
ggtitle("Historgram Distribution of How Many Unique ASVs", "per Coral Sample ID")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
unique_bacteria_counts_per_sample %>%
ungroup() %>%
filter(zero=="F") %>% # F group means ASV count was greater than 0
# calculate mean of unique ASVs to have text on boxplot
mutate(mean_n = mean(n)) %>%
ggplot(aes(x = n, y = zero)) +
xlab("Unique ASVs per Sample ID") +
geom_boxplot(show.legend = F) +
stat_summary(fun.y=mean, geom="point", shape=20, size=5, color="red", fill="red") +
ggtitle("Boxplot Distribution of How Many Unique ASVs ", "Average per Coral Sample ID: 204")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
unique_bacteria_counts_per_sample %>%
ungroup() %>%
filter(zero=="F") %>%
summary()
## sample_id zero n
## Length:659 Length:659 Min. : 14.0
## Class :character Class :character 1st Qu.:119.0
## Mode :character Mode :character Median :180.0
## Mean :204.3
## 3rd Qu.:263.0
## Max. :862.0
unique_bacteria_counts_per_sample %>%
ungroup() %>% # grouping doesn't matter for graphing
filter(zero=="T") %>% # T group means ASV count was 0
ggplot(aes(x = n)) +
geom_histogram(show.legend = F) +
ggtitle("Historgram Distribution of How Many 0 Counts for ASV", "per Coral Sample ID")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
unique_bacteria_counts_per_sample %>%
ungroup() %>%
filter(zero=="T") %>%
mutate(mean_n = mean(n)) %>%
ggplot(aes(x = n, y = zero)) +
xlab("Count of 0 for ASV per Sample ID") +
geom_boxplot(show.legend = F) +
stat_summary(fun.y=mean, geom="point", shape=20, size=5, color="orange", fill="orange") +
ggtitle("Boxplot Distribution of How Many 0 Counts of ASV", "Average per Coral Sample ID: ______")
unique_bacteria_counts_per_sample %>%
ungroup() %>%
filter(zero=="T") %>%
summary()
## sample_id zero n
## Length:659 Length:659 Min. :32310
## Class :character Class :character 1st Qu.:32909
## Mode :character Mode :character Median :32992
## Mean :32968
## 3rd Qu.:33053
## Max. :33158